###Note: Datasets p53.csv, c2part1.csv and c2part2.csv can be downloaded from http://www.ualberta.ca/~yyasui/homepage.html #tibsh.csv is used for Supporting Text figure and represents the output from SAM Excel Add-In, for input dataset p53.csv #Read data library(qvalue) try1=read.csv(file="G:/Common/Irina/Gene set reduction/c2part1.csv",header = TRUE, sep = ",",fill = T) try2=read.csv(file="G:/Common/Irina/Gene set reduction/c2part2.csv",header = TRUE, sep = ",",fill = T) try=cbind(try1,try2) restrictmapgenes=try[,-1] nofgensets=dim(restrictmapgenes)[2] restrictnofgensets=nofgensets Gender=read.csv("G:/Common/Irina/Gene set reduction/p53.csv",sep=",",header=T) Gender=as.data.frame(Gender) dim(Gender) n1=33 n2=17 n=n1+n2 #read tibsh FDR-values tibsh=read.csv("G:/Common/Irina/Gene set reduction/1000perms/tibsh.csv",sep=",",header=T) tibsh=as.data.frame(tibsh) dim(tibsh) #order by gene name tibsho=tibsh[order(tibsh[,2]),] tryo=try[order(try[,1]),] ###End of reading data ###Run SAM-GS groups=100 samparam=as.integer(dim(Gender)[1]/groups) x=as.matrix(Gender[,3:(2+n1+n2)]) y=c(rep(1,n1),rep(2,n2)) genenames=as.character(Gender[,1]) realsetsize=rep(NA,nofgensets) for(i in 1:nofgensets) { realsetsize[i]=sum(restrictmapgenes[,i]) } restrictsetsize=realsetsize summary(aa[restrictmapgenes[,1]==1,2]) noperm=1000 mysampermlist=matrix(NA,noperm,restrictnofgensets) diperm=matrix(NA,dim(Gender)[1],noperm) permcontrol = matrix(NA,noperm,n1) permcontrol[1,]=c(1:n1) for (i in 2:noperm) {permcontrol[i,] = sample((n1+n2),n1)} meanvar=function(a){ c(mean(a),var(a)) } for (i in 1:noperm){ control=permcontrol[i,] a=rep(2,(n1+n2)) a[control]=1 all=c(1:(n1+n2)) case=all[-control] xmeans1=rowMeans(x[,control]) xmeans2=rowMeans(x[,case]) dif1=x[,control]-xmeans1 dif2=x[,case]-xmeans2 var1=rowSums(dif1^2) var2=rowSums(dif2^2) rr=xmeans1-xmeans2 sum1=var1 sum2=var2 ss=sqrt((1/n1+1/n2)*(sum1+sum2)/(n1+n2-2)) if (i==1) { #calculate s0 ssrr=as.data.frame(cbind(ss,rr)) ssrr=ssrr[order(ssrr[,1]),] samconst=quantile(ssrr[,1],seq(0,1,.05)) tt=matrix(NA,groups,length(samconst)) coeffvar=as.data.frame(cbind(samconst,rep(NA,length(samconst)))) for(j in 1:length(samconst)){ ssrrj=matrix(ssrr[1:samparam*groups,1]/(ssrr[1:samparam*groups,2]+samconst[j]),samparam,groups) tt[,j]=apply(ssrrj,2,mad,constant=.64) coeffvar[j,2]=sqrt(var(tt[,j]))/mean(tt[,j]) } minsamconst=min(samconst[coeffvar[,2]==min(coeffvar[,2])]) if (is.na(minsamconst)) minsamconst=0 } #end calculating s0 mysamout=(rr/(ss+minsamconst))^2 diperm[,i]=rr/(ss+minsamconst) if (i==1) {SAMtlikestat=rr/(ss+minsamconst);} mysampermlist[i,]=t(restrictmapgenes)%*%mysamout } #end for (i in 1:noperm) #calculate gene set p-val mycountlist=rep(0,restrictnofgensets) for(ilist in 1:restrictnofgensets) for (i in 2:noperm){if (mysampermlist[i,ilist]>=mysampermlist[1,ilist]) mycountlist[ilist]=mycountlist[ilist]+1} pval=mycountlist/(noperm-1) qobj=qvalue(pval,lambda=.5) qval=as.data.frame(qobj$qvalue) samvsgsea=as.data.frame(cbind(colnames(restrictmapgenes),restrictsetsize,pval,qval)) colnames(samvsgsea)=c("GS Name","GS Size","GS Pvalue","GS Qval") samvsgsea=samvsgsea[order(pval),] ###End of SAM-GS ###SAM-GSR for pbar cutoff c from .1 to .9: look at gsith gene set in samvsgsea with smallest pval c=seq(.05,.45,.05) simresults=list() redset=list() tibshredset=list() tibshjunk=list() rt=31 #no of rows in results table redsetsizec=matrix(NA,rt,length(c)) out1=as.data.frame(cbind(samvsgsea[1:rt,1:2],matrix(NA,rt,6*length(c)))) out2=as.data.frame(cbind(samvsgsea[1:rt,1:2],matrix(NA,rt,6*length(c)))) #31 with p-val<.01, FDR<.033 for (op in 1:rt){ print(op) gsi=order(pval)[op] digsi=diperm[restrictmapgenes[,gsi]==1,] #select dimatrix for gene set of interest dim(digsi) digsisq=digsi**2 order(-digsisq[,1]) digsisq[4,1:100] odigsisq=digsisq[order(-digsisq[,1]),] odigsisq[1,1:100] #Reduce gene set #calculate gene set red p-val ns=nrow(odigsisq) #set size print(paste("ns",ns)) gsredpval=rep(0,(ns-1)) #pbar gsredpvalR=rep(0,(ns-1)) samgs=rep(NA,ns) for(g in 1:(ns-1)){ for (i in 2:noperm){ if (sum(odigsisq[(g+1):ns,i])>=sum(odigsisq[(g+1):ns,1])) gsredpval[g]=gsredpval[g]+1 #pbar if (sum(odigsisq[1:g,i])>=sum(odigsisq[1:g,1])) gsredpvalR[g]=gsredpvalR[g]+1 samgs[g]=sum(odigsisq[(g+1):dim(odigsisq)[1],1]) } } gsredpval=gsredpval/(noperm-1) #pbar-value gsredpvalR=gsredpvalR/(noperm-1) #p-value #redsetsize based on pbar cutoff for (cit in 1:length(c)){ if (.05>=max(gsredpval)) redsetsizec[op,cit]=ns else redsetsizec[op,cit]=1+sum(c[cit]>=gsredpval) print(redsetsizec[op,cit]) gspval=cbind(gsredpvalR,gsredpval) #simresults[[op]]=gspval redset[[op]]=Gender[restrictmapgenes[,gsi]==1,1][order(-digsisq[,1])][1:(redsetsizec[op,cit])] tibshredset[[op]]=tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][1:redsetsizec[op,cit],] tibshjunk[[op]]=tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][-(1:redsetsizec[op,cit]),] out1[op,(6*cit-3):(6*cit-1)]=1/100*summary(tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][1:redsetsizec[op,cit],2])[c(2,3,5)] out1[op,(6*cit):(6*cit+2)]=1/100*summary(tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][-(1:redsetsizec[op,cit]),2])[c(2,3,5)] out2[op,(6*cit-3):(6*cit-1)]=1/100*summary(tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][1:redsetsizec[op,cit],2])[c(1,3,6)] out2[op,(6*cit):(6*cit+2)]=1/100*summary(tibsho[tryo[,(1+gsi)]==1,c(2,8)][order(-tibsho[tryo[,(1+gsi)]==1,4]**2),][-(1:redsetsizec[op,cit]),2])[c(1,3,6)] }#end for cit in 1 to 9 }# end of op iterations #write.table(out1,file="G:/Common/Irina/Gene set reduction/1000perms/July 2007/data fig2.xls",sep="\t",quote=F,col.names=T,row.names=F) #write.table(out1,file="G:/Common/Irina/Gene set reduction/1000perms/July 2007/data fig2a.xls",sep="\t",quote=F,col.names=T,row.names=F) #Prepare data for Supporting Text datafig2=as.data.frame(cbind( c(cbind(rep(.05,6),rep(.1,6),rep(.15,6),rep(.2,6),rep(.25,6),rep(.3,6),rep(.35,6),rep(.4,6),rep(.45,6))) ,t(out1[,-c(1:2)]))) colnames(datafig2)=c("Cutoff", out1[,1]) #Table 2 and Table 3 for (cit in 1:length(c)){ out=as.data.frame(cbind(samvsgsea[1:rt,1:2],matrix(NA,rt,(max(redsetsizec[,cit])+1)))) for (op in 1:rt){ out[op,3]=redsetsizec[op,cit] out[op,4:(redsetsizec[op,cit]+3)]=as.character(redset[[op]][1:redsetsizec[op,cit]]) colnames(out)=c("Gene Set", "Set Size", "Reduced Set Size" ,paste("selected gene",c(1:max(redsetsizec[,cit])))) f1=paste("W:/Common/Irina/Gene set reduction/1000perms/Oct 2007/Corrections/tab2 cutoff",cit,sep="") f2=paste(f1,".xls",sep="") write.table(out,file=f2,sep="\t",quote=F,col.names=T,row.names=F) temp=as.matrix(out[,4:ncol(out)]) tab3=as.data.frame(table(c(temp))) tab3=tab3[order(tab3[,2],decreasing=T),] f1=paste("W:/Common/Irina/Gene set reduction/1000perms/Oct 2007/Corrections/tab3 cutoff",cit,sep="") f2=paste(f1,".xls",sep="") write.table(tab3,file=f2,sep="\t",quote=F,col.names=T,row.names=F) } }# End of Table 2 and Table 3 pdf()#Supporting Text par(mfrow=c(2,2)) for (i in 1:4){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 5:8){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 9:12){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 13:16){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 17:20){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 21:24){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 25:28){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } par(mfrow=c(2,2)) for (i in 29:31){ plot(datafig2[seq(2,50,6),1],datafig2[seq(2,50,6),(i+1)],col=2,ylim=c(0,1),xlab="Cutoff c",ylab="FDR summary") title(main=as.character(out1[i,1])) segments(datafig2[seq(1,49,6),1],datafig2[seq(1,49,6),(i+1)],datafig2[seq(3,51,6),1],datafig2[seq(3,51,6),(i+1)],col=2) points(datafig2[seq(5,53,6),1],datafig2[seq(5,53,6),(i+1)],col=3) segments(datafig2[seq(4,52,6),1],datafig2[seq(4,52,6),(i+1)],datafig2[seq(6,54,6),1],datafig2[seq(6,54,6),(i+1)],col=3) } dev.off()#End of Suporting Text #save.image(file="W:/Common/Irina/Gene set reduction/1000perms/Oct 2007/Corrections/p53 example.RData")